# This was developed on Google Colab
%env HV_DOC_HTML=true
env: HV_DOC_HTML=true
# pip install -q holoviews hvplot
# Import required libraries and dependencies
import pandas as pd
import numpy as np
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Load the crypto market data into a DataFrame, indexed by the coin identifier
df_market_data = pd.read_csv("Resources/crypto_market_data.csv", index_col="coin_id")

# Preview the first ten rows
df_market_data.head(10)
| price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | |
|---|---|---|---|---|---|---|---|
| coin_id | |||||||
| bitcoin | 1.08388 | 7.60278 | 6.57509 | 7.67258 | -3.25185 | 83.51840 | 37.51761 |
| ethereum | 0.22392 | 10.38134 | 4.80849 | 0.13169 | -12.88890 | 186.77418 | 101.96023 |
| tether | -0.21173 | 0.04935 | 0.00640 | -0.04237 | 0.28037 | -0.00542 | 0.01954 |
| ripple | -0.37819 | -0.60926 | 2.24984 | 0.23455 | -17.55245 | 39.53888 | -16.60193 |
| bitcoin-cash | 2.90585 | 17.09717 | 14.75334 | 15.74903 | -13.71793 | 21.66042 | 14.49384 |
| binancecoin | 2.10423 | 12.85511 | 6.80688 | 0.05865 | 36.33486 | 155.61937 | 69.69195 |
| chainlink | -0.23935 | 20.69459 | 9.30098 | -11.21747 | -43.69522 | 403.22917 | 325.13186 |
| cardano | 0.00322 | 13.99302 | 5.55476 | 10.10553 | -22.84776 | 264.51418 | 156.09756 |
| litecoin | -0.06341 | 6.60221 | 7.28931 | 1.21662 | -17.23960 | 27.49919 | -12.66408 |
| bitcoin-cash-sv | 0.92530 | 3.29641 | -1.86656 | 2.88926 | -24.87434 | 7.42562 | 93.73082 |
# Generate summary statistics
# describe() reports count, mean, std, min, quartiles, and max for each
# numeric column — used below to gauge the very different column scales
# before standardizing.
df_market_data.describe()
| price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | |
|---|---|---|---|---|---|---|---|
| count | 41.000000 | 41.000000 | 41.000000 | 41.000000 | 41.000000 | 41.000000 | 41.000000 |
| mean | -0.269686 | 4.497147 | 0.185787 | 1.545693 | -0.094119 | 236.537432 | 347.667956 |
| std | 2.694793 | 6.375218 | 8.376939 | 26.344218 | 47.365803 | 435.225304 | 1247.842884 |
| min | -13.527860 | -6.094560 | -18.158900 | -34.705480 | -44.822480 | -0.392100 | -17.567530 |
| 25% | -0.608970 | 0.047260 | -5.026620 | -10.438470 | -25.907990 | 21.660420 | 0.406170 |
| 50% | -0.063410 | 3.296410 | 0.109740 | -0.042370 | -7.544550 | 83.905200 | 69.691950 |
| 75% | 0.612090 | 7.602780 | 5.510740 | 4.578130 | 0.657260 | 216.177610 | 168.372510 |
| max | 4.840330 | 20.694590 | 24.239190 | 140.795700 | 223.064370 | 2227.927820 | 7852.089700 |
# Quick visual check of the raw data: one line per price-change column,
# with x-axis labels rotated 90 degrees so the coin ids stay readable.
hvplot.extension('bokeh')
df_market_data.hvplot.line(width=800, height=400, rot=90)
# Normalize the CSV data with scikit-learn's StandardScaler (zero mean,
# unit variance per column), then rebuild a DataFrame that keeps the
# original column names and the coin_id index.
scaler = StandardScaler()
scaled_array = scaler.fit_transform(df_market_data)
df_market_data_scaled = pd.DataFrame(
    scaled_array,
    columns=df_market_data.columns,
    index=df_market_data.index,
)

# Display sample data
df_market_data_scaled.head()
| price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | |
|---|---|---|---|---|---|---|---|
| coin_id | |||||||
| bitcoin | 0.508529 | 0.493193 | 0.772200 | 0.235460 | -0.067495 | -0.355953 | -0.251637 |
| ethereum | 0.185446 | 0.934445 | 0.558692 | -0.054341 | -0.273483 | -0.115759 | -0.199352 |
| tether | 0.021774 | -0.706337 | -0.021680 | -0.061030 | 0.008005 | -0.550247 | -0.282061 |
| ripple | -0.040764 | -0.810928 | 0.249458 | -0.050388 | -0.373164 | -0.458259 | -0.295546 |
| bitcoin-cash | 1.193036 | 2.000959 | 1.760610 | 0.545842 | -0.291203 | -0.499848 | -0.270317 |
# Candidate cluster counts for the elbow method: k = 1 through 11.
# list(range(...)) builds the plain Python list directly — the original
# np.arange(1, 12).tolist() produced the identical list via an
# unnecessary numpy round-trip.
k_values = list(range(1, 12))
print(k_values)
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
def get_inertia(k, df):
    """Fit a K-Means model with k clusters on df and return its inertia.

    random_state=1 keeps the runs reproducible; n_init='auto' uses
    scikit-learn's default initialization strategy.
    """
    model = KMeans(n_clusters=k, random_state=1, n_init='auto')
    return model.fit(df).inertia_
# Compute the inertia for every candidate value of k.
# For each k the helper:
#   1. builds a KMeans model with n_clusters=k
#   2. fits it to `df_market_data_scaled`
#   3. returns the fitted model's inertia
inertia_values = []
for k in k_values:
    inertia_values.append(get_inertia(k, df_market_data_scaled))

# Collect (k, inertia) pairs into a DataFrame for the Elbow curve
elbow_df = pd.DataFrame({'k': k_values, 'inertia': inertia_values})
elbow_df
| k | inertia | |
|---|---|---|
| 0 | 1 | 287.000000 |
| 1 | 2 | 212.123342 |
| 2 | 3 | 145.897940 |
| 3 | 4 | 131.457370 |
| 4 | 5 | 66.317106 |
| 5 | 6 | 57.402668 |
| 6 | 7 | 49.212644 |
| 7 | 8 | 44.799804 |
| 8 | 9 | 33.859468 |
| 9 | 10 | 29.250314 |
| 10 | 11 | 27.187162 |
# Line chart of inertia vs. k: the "elbow" (where the curve flattens)
# marks the optimal number of clusters for the scaled data.
hvplot.extension('bokeh')
scaled_elbow = elbow_df.hvplot.line(
    x='k',
    y='inertia',
    xticks=k_values,
    width=800,
    height=400,
    title='Scaled Elbow',
)
scaled_elbow
# Initialize the K-Means model with the best k from the elbow curve above
# (k=5) and a fixed random_state for reproducibility.
model_scaled = KMeans(n_init='auto', n_clusters=5, random_state=1)

# Fit the model to the scaled feature data
model_scaled.fit(df_market_data_scaled)
KMeans(n_clusters=5, n_init='auto', random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KMeans(n_clusters=5, n_init='auto', random_state=1)
# Assign each cryptocurrency to a cluster using the fitted scaled-data model
prediction_scaled = model_scaled.predict(df_market_data_scaled)

# Print each predicted cluster label
for label in prediction_scaled:
    print(label)
1 1 2 2 1 1 1 1 1 2 0 2 2 1 2 2 2 2 1 2 0 1 2 2 2 2 2 0 1 2 2 2 3 2 0 0 4 0 2 2 0
# Attach the predicted cluster labels as a new column on the scaled DataFrame
df_market_data_scaled = df_market_data_scaled.assign(
    classification_scaled=prediction_scaled
)

# Display sample data
df_market_data_scaled.head()
| price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | classification_scaled | |
|---|---|---|---|---|---|---|---|---|
| coin_id | ||||||||
| bitcoin | 0.508529 | 0.493193 | 0.772200 | 0.235460 | -0.067495 | -0.355953 | -0.251637 | 1 |
| ethereum | 0.185446 | 0.934445 | 0.558692 | -0.054341 | -0.273483 | -0.115759 | -0.199352 | 1 |
| tether | 0.021774 | -0.706337 | -0.021680 | -0.061030 | 0.008005 | -0.550247 | -0.282061 | 2 |
| ripple | -0.040764 | -0.810928 | 0.249458 | -0.050388 | -0.373164 | -0.458259 | -0.295546 | 2 |
| bitcoin-cash | 1.193036 | 2.000959 | 1.760610 | 0.545842 | -0.291203 | -0.499848 | -0.270317 | 1 |
# Scatter plot of 24h vs. 7d price change, colored by the K-Means cluster
# labels found on the scaled data; `hover_cols` surfaces the coin_id so
# each point can be identified on hover.
# hvplot.extension('bokeh')
scaled_scatter = df_market_data_scaled.hvplot.scatter(
    x='price_change_percentage_24h',
    y='price_change_percentage_7d',
    by='classification_scaled',
    hover_cols='coin_id',
    width=800,
    height=400,
    title='Scaled Scatter',
)
scaled_scatter
# Reduce the scaled features to three principal components.
pca_model = PCA(n_components=3)

# iloc[:, :-1] drops the last column (the classification_scaled labels
# added above) so only the seven scaled feature columns feed the PCA.
features = df_market_data_scaled.iloc[:, :-1]
pca = pca_model.fit_transform(features)

# Wrap the component scores in a DataFrame, keeping the coin_id index
pca_df = pd.DataFrame(
    pca,
    columns=['PC1', 'PC2', 'PC3'],
    index=df_market_data_scaled.index,
)

# View the first five rows of the DataFrame.
display(pca_df.head())
| PC1 | PC2 | PC3 | |
|---|---|---|---|
| coin_id | |||
| bitcoin | -0.600667 | 0.842760 | 0.461595 |
| ethereum | -0.458261 | 0.458466 | 0.952877 |
| tether | -0.433070 | -0.168126 | -0.641752 |
| ripple | -0.471835 | -0.222660 | -0.479053 |
| bitcoin-cash | -1.157800 | 2.041209 | 1.859715 |
# Explained variance ratio: how much of the original information each
# principal component captures, plus the total across all three.
variance_ratios = pca_model.explained_variance_ratio_
print(variance_ratios)
print(variance_ratios.sum())
[0.3719856  0.34700813 0.17603793]
0.8950316570309842
Question: What is the total explained variance of the three principal components?
Answer: 0.8950316570309842 — the three principal components explain about 89.5% of the total variance.
# Repeat the elbow method on the PCA-reduced data: compute inertia for
# every candidate k, then plot inertia vs. k to find the optimal value.
pca_inertia_values = []
for k in k_values:
    pca_inertia_values.append(get_inertia(k, pca_df))

# DataFrame of (k, inertia) pairs for the PCA Elbow curve
pca_elbow_df = pd.DataFrame({'k': k_values, 'inertia': pca_inertia_values})

hvplot.extension('bokeh')
pca_elbow = pca_elbow_df.hvplot.line(
    x='k',
    y='inertia',
    xticks=k_values,
    width=800,
    height=400,
    title='PCA Elbow',
)
pca_elbow
Question: What is the best value for k when using the PCA data?
Answer: 4

Question: Does it differ from the best k value found using the original data?
Answer: Yes — the scaled data suggested k=5, while the PCA data suggests k=4.
# Initialize the K-Means model using the best value for k
# k=4 chosen from the PCA elbow curve above; fixed random_state for
# reproducibility.
model_pca = KMeans(n_clusters=4, random_state=1, n_init='auto')

# Fit the K-Means model using the PCA data.
# BUG FIX: the original fit and predict calls used the scaled feature
# columns (df_market_data_scaled.iloc[:, :-1]) instead of `pca_df`,
# contradicting the stated intent of clustering on the three principal
# components computed in the previous section.
model_pca.fit(pca_df)

# Predict the clusters to group the cryptocurrencies using the PCA data
prediction_pca = model_pca.predict(pca_df)

# Print the resulting array of cluster values.
for label in prediction_pca:
    print(label)
1 1 2 2 1 1 1 1 2 2 0 2 2 1 2 2 2 2 1 2 0 1 2 2 2 2 2 0 1 2 2 2 3 2 0 0 1 0 2 2 0
# Attach the PCA-based cluster labels as a new column on the DataFrame
df_market_data_scaled = df_market_data_scaled.assign(
    classification_pca=prediction_pca
)

# Display sample data
df_market_data_scaled.head()
| price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | classification_scaled | classification_pca | |
|---|---|---|---|---|---|---|---|---|---|
| coin_id | |||||||||
| bitcoin | 0.508529 | 0.493193 | 0.772200 | 0.235460 | -0.067495 | -0.355953 | -0.251637 | 1 | 1 |
| ethereum | 0.185446 | 0.934445 | 0.558692 | -0.054341 | -0.273483 | -0.115759 | -0.199352 | 1 | 1 |
| tether | 0.021774 | -0.706337 | -0.021680 | -0.061030 | 0.008005 | -0.550247 | -0.282061 | 2 | 2 |
| ripple | -0.040764 | -0.810928 | 0.249458 | -0.050388 | -0.373164 | -0.458259 | -0.295546 | 2 | 2 |
| bitcoin-cash | 1.193036 | 2.000959 | 1.760610 | 0.545842 | -0.291203 | -0.499848 | -0.270317 | 1 | 1 |
# Same 24h vs. 7d scatter as before, but colored by the PCA-based
# cluster labels; hover shows the coin_id for each point.
# hvplot.extension('bokeh')
pca_scatter = df_market_data_scaled.hvplot.scatter(
    x='price_change_percentage_24h',
    y='price_change_percentage_7d',
    by='classification_pca',
    hover_cols='coin_id',
    width=800,
    height=400,
    title='PCA Scatter',
)
pca_scatter
In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.
# Composite plot: the two Elbow curves side by side for comparison
elbow_comparison = scaled_elbow + pca_elbow
elbow_comparison
# Composite plot: the two cluster scatter plots side by side for comparison
scatter_comparison = scaled_scatter + pca_scatter
scatter_comparison
Question: After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?
Answer: There wasn't much of a difference between the scaled clusters and the PCA clusters beyond identifying/classifying a significant outlier. The PCA clustering placed one additional data point into the second classification compared with the scaled method. Overall, the PCA method appears to produce a more concise visual classification of the data provided.